# Snapshot the objects that exist before this script runs so that the cleanup
# step at the bottom of the file removes only what was created here.
keep <- ls()

###############################################
#Census 1860 All Indiana: Clean for prediction#
###############################################

# Load the 1860 Indiana census extract and keep only the counties listed below.
census_1860_all <- fread(paste0(ipums_census_path,'/indiana_1860.txt'))
census_1860_all <- census_1860_all[
  ResidenceCounty %in% c("Bartholomew", 'Boone', "Johnson", "Hamilton",
                         "Hendricks", "Henry", "Montgomery",
                         "Morgan", "Vermillion")
]

# Ages that cannot be parsed as numbers become NA and are then recoded to 0.
census_1860_all[, ResidenceAge := as.numeric(ResidenceAge)]
census_1860_all[is.na(ResidenceAge), ResidenceAge := 0]

# Household-level covariates, computed within each FamilyID.
# The first row of each FamilyID group is flagged as the household head, and
# its Occupation is copied to every member of the group.
census_1860_all[, hh_head := seq_len(.N) == 1L, by = FamilyID]
census_1860_all[, hh_head_occupation := Occupation[1], by = FamilyID]

# hh_kids counts members whose age is NOT in 19:100 (this includes the
# age-0 rows produced by the NA recode above).
census_1860_all[, hh_kids := sum(!(ResidenceAge %in% 19:100)), by = FamilyID]

# Number of members aged 5-18 recorded as attending school.
census_1860_all[,
  hh_kids_school := sum(AttendedSchool[ResidenceAge %in% 5:18] == 'Yes'),
  by = FamilyID
]

# Compact household composition string: "Gender_Age Gender_Age ...".
census_1860_all[,
  hh_comp_str := paste(paste(Gender, ResidenceAge, sep = "_"), collapse = " "),
  by = FamilyID
]

# Household wealth totals; values that are not numeric are ignored.
census_1860_all[,
  hh_real_estate := sum(as.numeric(RealEstateValue), na.rm = TRUE),
  by = FamilyID
]
census_1860_all[,
  hh_personal_estate := sum(as.numeric(PersonalEstateValue), na.rm = TRUE),
  by = FamilyID
]

# married is TRUE for a male aged 18+ whose NEXT row within the same FamilyID
# is a female aged 18+ with the same surname and an age gap of at most 16
# years. The last row of each household has no next row; the `%in%` tests
# turn the resulting NAs into FALSE.
census_1860_all[, married := {
  next_gender <- shift(Gender, type = 'lead')
  next_surname <- shift(Surname, type = 'lead')
  next_age <- shift(ResidenceAge, type = 'lead')
  (Gender %in% 'Male') &
    (ResidenceAge >= 18) &
    next_gender %in% "Female" &
    next_surname == Surname &
    next_age >= 18 &
    ((ResidenceAge - next_age) %in% -16:16)
}, by = FamilyID]

# Keep only males older than 10 whose Race is "White" or blank. This filter
# runs last so the household covariates above are computed from every member.
census_1860_all <- census_1860_all[
  Gender %in% "Male" &
    Race %in% c("White", "") &
    ResidenceAge > 10
]

#Clean Names
# clean_names() is defined outside this file; whatever it returns is assigned,
# in order, to the four match_* columns named here.
census_1860_all[,
  c("match_first", "match_middle", "match_last", "match_first_clean") :=
    clean_names(first = Given, middle = NULL, last = Surname)
]

#Birth Year
# ResidenceAge was already converted to numeric when the table was loaded,
# so it can be subtracted from the census year directly.
census_1860_all[, birth_year := 1860 - ResidenceAge]


#Census covariate data
# Select and rename the per-person covariates used for matching and
# prediction. One output row per row of census_1860_all.
census_1860_match <- census_1860_all[, list(
  mpcid,
  census_county = ResidenceCounty,
  census_city = ResidenceCity,
  census_po = PostOffice,
  match_first,
  match_middle,
  match_last,
  match_first_clean,
  # Phonetic codes of the cleaned first and last names.
  first_sound = metaphone(match_first),
  last_sound = metaphone(match_last),
  birth_year,
  real_estate = RealEstateValue,
  personal_estate = PersonalEstateValue,
  birth_place = BirthPlace,
  # Strip EVERY character that is not a letter, space or '&', then every
  # space. str_replace() would only touch the first match, leaving most
  # punctuation and spaces in place, so str_replace_all() is required.
  # NOTE(review): unlike hh_head_occupation below, this column is not
  # lower-cased; confirm whether that asymmetry is intended.
  occupation = Occupation %>%
    str_replace_all("[^A-Za-z &]", "") %>%
    str_replace_all(' ', ''),
  school = AttendedSchool,
  illiterate = CannotRead,
  disability = DisabilityCondition,
  hh_head,
  # Same cleaning as occupation, followed by lower-casing.
  hh_head_occupation = hh_head_occupation %>%
    str_replace_all("[^A-Za-z &]", "") %>%
    str_replace_all(' ', '') %>%
    str_to_lower(),
  hh_kids,
  hh_kids_school,
  hh_comp_str,
  hh_real_estate,
  hh_personal_estate,
  married
)]


#add in clean birth places
# Key both tables on birth_place (which also sorts them by that column), then
# join: every row of census_1860_match is kept and gains the columns of
# census_bp for its birth_place.
census_bp <- fread('./raw/census_1860_in_birth_places.csv')
setkeyv(census_1860_match, "birth_place")
setkeyv(census_bp, "birth_place")
census_1860_match <- census_bp[census_1860_match]

#Add in enlisted label
# A census record counts as enlisted when its mpcid appears in the
# soldier-to-census link file.
cwdb_census_links <- fread('./cleaned/cwdb_to_census_links.csv')
census_1860_match[, enlisted := mpcid %in% cwdb_census_links$mpcid]

#Save covariates for matched-to-soldiers
# Only the rows flagged as enlisted are written out.
fwrite(
  census_1860_match[which(enlisted)],
  './cleaned/census_indiana_covariates.csv'
)

##########################################
#Cleanup##################################
##########################################
# Drop everything this script created: remove every object that was not
# present in the snapshot taken at the top of the file (the snapshot vector
# `keep` itself is retained), then trigger garbage collection.
rm(list = setdiff(ls(), union(keep, "keep")))
gc()
